In [1]:
from __future__ import division
import math
#import matplotlib as mpl ### May need to uncomment these two lines for mac osx users
#mpl.use('TkAgg') ###
from matplotlib import pyplot as plt
plt.style.use('ggplot')
import pandas as pd
import numpy as np
%matplotlib inline
Let's load the dataset.
In [2]:
# Read the Titanic training set; the path is relative to the notebook's working directory.
df = pd.read_csv("data/train.csv")
# Preview the first rows to confirm the file parsed as expected.
df.head()
Out[2]:
In [3]:
# Also look at the last rows, to check the file was read through to its end.
df.tail()
Out[3]:
Let's see the column types that pandas inferred for us on import.
In [4]:
# One dtype per column, as inferred by read_csv.
df.dtypes
Out[4]:
In [5]:
# Per-column non-null counts and dtypes. info() prints its report directly,
# which is why this cell has no Out[] value.
df.info()
In [6]:
# Summary statistics; by default describe() only covers the numeric columns.
df.describe()
Out[6]:
See the shape of the dataset
In [7]:
# (number of rows, number of columns)
df.shape
Out[7]:
Here we can see that it has 891 rows of data and 12 attributes' worth of information.
In [8]:
# Row count: the same number as the first element of df.shape.
len(df)
Out[8]:
In [9]:
# Column count: the same number as the second element of df.shape.
len(df.columns)
Out[9]:
In [10]:
# df.columns holds the column labels themselves (a pandas Index).
df.columns
Out[10]:
In [11]:
# Indexing with a single label returns that column as a pandas Series.
df["Name"]
Out[11]:
In [12]:
# Boolean-mask lookup: keep only the row(s) whose Name matches this string exactly.
my_famous_passenger = df[df["Name"] == "Guggenheim, Mr. Benjamin"]
# End the cell with the frame itself rather than print(): the notebook then
# renders it as a table instead of line-wrapped plain text.
my_famous_passenger
In [13]:
### Let's get some information about individual columns.
# A notebook cell only echoes its LAST expression, so the two means are printed
# explicitly; as bare expressions they would be computed and silently discarded.
### Mean age on the boat
print("Mean age: {}".format(df["Age"].mean()))
### Mean fare
print("Mean fare: {}".format(df["Fare"].mean()))
# Full summary of Fare, shown as the cell's output.
df["Fare"].describe()
Out[13]:
In [14]:
# Find the passenger(s) who paid a fare of 512.3292.
# Fare is a float column parsed from text, so an exact == against a literal can
# miss by a rounding error; np.isclose compares within a small tolerance instead.
my_rich_passenger = df[np.isclose(df["Fare"], 512.3292)]
# Bare expression (not print) so the notebook renders the rows as a table.
my_rich_passenger
In [15]:
# Rearranging columns by hand with a csv library would be tedious.
# With pandas it starts from a plain Python list of the column labels.
cols = df.columns.tolist()
print(cols)
In [16]:
# Same labels as printed above, written out in the order we want:
# 'PassengerId' is moved from the front to the very end.
cols = [
    'Survived', 'Pclass', 'Name', 'Sex',
    'Age', 'SibSp', 'Parch', 'Ticket',
    'Fare', 'Cabin', 'Embarked',
    'PassengerId',
]
# Indexing with a list of labels returns the columns in the listed order.
new_df = df[cols]
new_df.head()
Out[16]:
In [17]:
# A new data frame can be built from just a few attributes:
# pass the list of wanted column labels to the indexer.
wanted = ["Sex", "Age"]
new_df = df[wanted]
new_df.head()
Out[17]:
In [18]:
# Comparing a column with a value yields a boolean Series (one True/False per row);
# indexing the frame with that Series keeps only the rows marked True.
is_female = df["Sex"] == "female"
is_male = df["Sex"] == "male"
df_of_women = df[is_female]
df_of_men = df[is_male]
In [19]:
# Check the female subset: every previewed row should have Sex == "female".
df_of_women.head()
Out[19]:
In [20]:
# Check the male subset as well. This cell previously repeated the female preview,
# so df_of_men was never inspected.
df_of_men.head()
Out[20]:
In [21]:
# Exercise
# Create three data frames, one per passenger class (the 'Pclass' column).
# There are three classes. Then figure out the size of each data frame.
In [22]:
# First-class passengers.
df_pclass_1 = df[df["Pclass"] == 1]
# Only a cell's last expression is echoed, so the size is printed explicitly and
# the preview is left last; a bare .head() mid-cell would be thrown away.
print(df_pclass_1.shape)
df_pclass_1.head()
Out[22]:
In [23]:
# Second-class passengers.
df_pclass_2 = df[df["Pclass"] == 2]
# Only a cell's last expression is echoed, so the size is printed explicitly and
# the preview is left last; a bare .head() mid-cell would be thrown away.
print(df_pclass_2.shape)
df_pclass_2.head()
Out[23]:
In [24]:
# Third-class passengers.
df_pclass_3 = df[df["Pclass"] == 3]
# Only a cell's last expression is echoed, so the size is printed explicitly and
# the preview is left last; a bare .head() mid-cell would be thrown away.
print(df_pclass_3.shape)
df_pclass_3.head()
Out[24]:
Observations: There were many people in third class, more than in the other two passenger classes combined.
We can create new attributes from other attributes!
In [25]:
# Element-wise sum of the two relative counts, stored as a new column on df.
# Note: this counts relatives aboard only; the passenger is not included in the total.
df['FamilySize'] = df['SibSp'] + df['Parch']
df.head()
Out[25]:
Since we know that Parch is the number of parents or children onboard, and SibSp is the number of siblings or spouses, we could collect those together as a FamilySize
In [26]:
# Distribution of passenger ages with the default binning.
df["Age"].hist()
Out[26]:
In [27]:
# Same histogram with explicit binning: 16 bins over 0-80 gives 5-year-wide age bands.
# dropna() makes the removal of missing ages explicit.
df["Age"].dropna().hist(bins=16, range=(0,80))
Out[27]:
In [28]:
# Distribution of ticket fares with the default binning.
df["Fare"].hist()
Out[28]:
In [29]:
# Fare against survival outcome. Axis labels and a title are set so the figure
# can be read on its own when the notebook is skimmed.
plt.scatter(df['Fare'], df['Survived'])
plt.xlabel('Fare')
plt.ylabel('Survived')
plt.title('Survived vs. Fare')
plt.show()
In [30]:
### Side step
# Let's create a scatter plot from two columns of random numbers.
# A locally seeded generator makes the values (and the plot built from them)
# identical on every Restart & Run All, without touching NumPy's global random state.
rng = np.random.RandomState(42)
d = {'one': rng.rand(10),
     'two': rng.rand(10)}
print(d)
In [31]:
# Each dict key becomes a column of the new frame.
df_scrap = pd.DataFrame(d)
# One matplotlib format string per column: 'ro' = red circles, 'bx' = blue crosses.
# Marker-only styles draw points without connecting lines, plotted against the row index.
df_scrap.plot(style=['ro','bx'])
Out[31]:
In [32]:
##### Back to the Titanic: re-display the original dataset.
# By this point df also carries the derived 'FamilySize' column added earlier.
df.head()
Out[32]:
In [33]:
#### Let's group the passengers by sex
grouped_by_sex = df.groupby(["Sex"])
# describe() on a groupby produces the summary statistics separately for each group.
grouped_by_sex.describe()
Out[33]:
In [34]:
#### Let's group them by sex AND passenger class (one group per combination)
grouped_by_sex_and_pclass = df.groupby(["Sex", "Pclass"])
grouped_by_sex_and_pclass.describe()
Out[34]:
In [35]:
#### Let's group them by sex, passenger class AND survival outcome
grouped_by_sex_and_pclass_survived = df.groupby(["Sex", "Pclass", "Survived"])
grouped_by_sex_and_pclass_survived.describe()
Out[35]:
In [36]:
# For each (Sex, Pclass, Survived) group, count() gives the number of NON-NULL values
# in every remaining column, so columns with missing data show smaller numbers
# than the group's actual row count.
df.groupby(['Sex', 'Pclass',"Survived"]).count()
Out[36]:
In [37]:
# Logistic Regression Time!
import statsmodels.api as sm
import pylab as pl
In [39]:
# Reminder of the available column labels before choosing the model inputs.
print(df.columns)
In [65]:
# Create a new temp data frame as an independent copy.
# Plain assignment (new_df = df) would only bind a second name to the SAME object,
# so modifying a column of new_df would also overwrite that column in df.
new_df = df.copy()
def gender_to_numeric(x):
    """Encode a sex label as a number for use as a regression input.

    Parameters
    ----------
    x : str or number
        A raw label such as "male" / "female", or a value that has already
        been encoded as 0 or 1.

    Returns
    -------
    int
        0 for "male" (or an already-encoded 0); 1 for an already-encoded 1
        and for every other value, including "female".
    """
    # Values that are already encoded pass through unchanged. Without this guard,
    # applying the function a second time (e.g. re-running the notebook cell)
    # would turn every 0 into 1, because 0 != "male".
    if x in (0, 1):
        return int(x)
    if x == "male":
        return 0
    return 1
In [72]:
# Replace the 'Sex' labels with their numeric codes so the column can be used
# as a regression input. The encoding is applied to each value in turn.
sex_codes = new_df['Sex'].apply(gender_to_numeric)
new_df['Sex'] = sex_codes
In [76]:
# Keep the outcome plus the three predictors, and drop rows with missing values
# so the model is fitted on complete observations only.
new_df = new_df[["Survived", "Age", "Sex", "Pclass"]].dropna()
# Every column after the first ("Survived") is a predictor.
train_cols = new_df.columns[1:]
# statsmodels does NOT add an intercept on its own. Without one, the model is forced
# to predict p = 0.5 when all predictors are zero, which distorts every coefficient.
# add_constant adds a constant column that acts as the intercept term.
exog = sm.add_constant(new_df[train_cols])
logit = sm.Logit(new_df['Survived'], exog)
# Fit the model
result = logit.fit()
In [78]:
# summary() builds a text report of the fit; print() renders it with its line breaks intact.
print(result.summary())
In [79]:
# Confidence interval for each fitted coefficient (statsmodels defaults to the 95% level).
print(result.conf_int())
In [ ]: